library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggrepel)
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
font_add_google("Lato", "lato")
showtext_auto()
babynames <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-22/babynames.csv')
## Rows: 1924665 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, name
## dbl (3): year, n, prop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
babynames_letter <- babynames %>% 
  mutate(name = tolower(name)) %>% 
  group_by(year, sex, name, n) %>% 
  summarise(letter = unlist(str_split(name, '')))
## `summarise()` has grouped output by 'year', 'sex', 'name', 'n'. You can override using the `.groups` argument.
babynames_letter %>% 
  group_by(sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  ggplot() +
  geom_col(aes(letter, prop, fill = sex), position = 'dodge')
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.

plot_dat <- babynames_letter %>% 
  mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>% 
  group_by(year, sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(year, sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  filter((letter %in% c('a', 'e', 'i', 'o', 'u')))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>% 
  group_by(sex, letter) %>% 
  filter(year == max(year) | year == min(year)) %>% 
  mutate(year = ifelse(year == max(year), year+2, year-2))

plot_dat %>% 
  ggplot()  +
  geom_line(aes(year, prop, color = sex, group = interaction(sex, letter))) +
  geom_text(aes(year, prop, color = sex, group = letter, label = letter),
            data = lab_let) +
  scale_y_sqrt() +
  theme_few() 

set.seed(1990)

plot_dat <- babynames_letter %>% 
  mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>% 
  group_by(year, sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(year, sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  # filter(!(letter %in% c('a', 'e', 'i', 'o', 'u'))) %>% 
  mutate(letter = letter %>% toupper()) %>% 
  ungroup() %>% 
  mutate(letter = factor(letter, sample(LETTERS)))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>% 
  group_by(sex, letter) %>% 
  filter(year == max(year) | year == min(year)) %>% 
  mutate(year = ifelse(year == max(year), year, year))

plot_dat %>% 
  ggplot()  +
  geom_line(aes(year, prop, color = letter,
                group = interaction(sex, letter))) +
  geom_text_repel(aes(year, prop, color = letter, label = letter),
            data = filter(lab_let, year == max(year)), 
            direction = "y", hjust = "left", nudge_x = 30,
            max.overlaps = Inf, min.segment.length = 0,
            segment.color = 'black', seed = 0, size = 4) +
  geom_text_repel(aes(year, prop, color = letter, label = letter),
            data = filter(lab_let, year == min(year)), 
            direction = "y", hjust = "left", nudge_x = -30,
            max.overlaps = Inf, min.segment.length = 0,
            segment.color = 'black', seed = 1, size = 4) +
  scale_y_sqrt(
    breaks = c(0.001, 0.007, seq(0.02, 0.2, 0.02)),
    sec.axis = dup_axis()
  ) +
  scale_x_continuous(
    expand = expansion(mult = 0.3),
    breaks = seq(1880, 2020, 20)
  ) +
  theme_few() +
  facet_wrap(~sex) +
  theme(
    legend.position = 'none'  
  )+
  ylab('Proportion') +
  xlab('Year') +
  labs(
    title = 'The popularity of the letters used in newborn baby names has changed over the years',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  )+
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 

ggsave('final_graph.pdf', height = 5, width = 12)
plt_fun <- function(highlight) {
  
  plot_dat %>% 
    ggplot()  +
    geom_line(aes(year, prop, color = letter, 
                  size = letter %in% highlight,
                  alpha = letter %in% highlight,
                  group = interaction(sex, letter))) +
    geom_text_repel(aes(year, prop, color = letter, label = letter,
                        alpha = letter %in% highlight),
              data = filter(lab_let, year == max(year)), 
              direction = "y", hjust = "left", nudge_x = 30,
              max.overlaps = Inf, min.segment.length = 0,
              segment.color = 'black', seed = 0, size = 4
              ) +
    geom_text_repel(aes(year, prop, color = letter, label = letter,
                        alpha = letter %in% highlight),
              data = filter(lab_let, year == min(year)), 
              direction = "y", hjust = "left", nudge_x = -30,
              max.overlaps = Inf, min.segment.length = 0,
              segment.color = 'black', seed = 1, size = 4) +
    scale_y_sqrt(
      breaks = c(0, 0.001, 0.007, seq(0.02, 0.2, 0.02)),
      sec.axis = dup_axis()
    ) +
    scale_x_continuous(
      expand = expansion(mult = 0.3),
      breaks = seq(1880, 2020, 20)
    ) +
    theme_few() +
    facet_wrap(~sex) +
    theme(
      legend.position = 'none'  
    ) +
    scale_size_discrete(range = c(0.5, 1)) +
    scale_alpha_discrete(range = c(0.3, 1)) +
    ylab('Proportion') +
    xlab('Year') 
}
plt_fun(c('A', 'E', 'I', 'O', 'U')) +
  labs(
    title = 'The vowels',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

ggsave('vowels.pdf', height = 5, width = 12)
babynames %>% 
  filter(year == '1950', sex == 'F', str_detect(tolower(name), 'k')) %>% 
  mutate(sum = sum(prop))
## # A tibble: 284 × 6
##     year sex   name          n     prop    sum
##    <dbl> <chr> <chr>     <dbl>    <dbl>  <dbl>
##  1  1950 F     Kathleen  25704 0.0146   0.0590
##  2  1950 F     Karen     24139 0.0137   0.0590
##  3  1950 F     Kathy      9089 0.00517  0.0590
##  4  1950 F     Kathryn    7864 0.00447  0.0590
##  5  1950 F     Katherine  6204 0.00353  0.0590
##  6  1950 F     Vicki      5827 0.00331  0.0590
##  7  1950 F     Vickie     3284 0.00187  0.0590
##  8  1950 F     Kay        2626 0.00149  0.0590
##  9  1950 F     Jackie     1276 0.000726 0.0590
## 10  1950 F     Kristine   1247 0.000709 0.0590
## # … with 274 more rows
babynames %>% 
  filter(year == '1950', sex == 'F') 
## # A tibble: 6,111 × 5
##     year sex   name         n   prop
##    <dbl> <chr> <chr>    <dbl>  <dbl>
##  1  1950 F     Linda    80432 0.0457
##  2  1950 F     Mary     65482 0.0372
##  3  1950 F     Patricia 47945 0.0273
##  4  1950 F     Barbara  41557 0.0236
##  5  1950 F     Susan    38018 0.0216
##  6  1950 F     Nancy    29618 0.0168
##  7  1950 F     Deborah  29064 0.0165
##  8  1950 F     Sandra   28895 0.0164
##  9  1950 F     Carol    26165 0.0149
## 10  1950 F     Kathleen 25704 0.0146
## # … with 6,101 more rows
plt_fun(c('F', 'S', 'O')) +
  labs(
    title = 'The rise and fall of letters',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

ggsave('peaks.pdf', height = 5, width = 12)
plt_fun(c('R', 'W')) +
  labs(
    title = 'Decreasing trends',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

ggsave('decreasing.pdf', height = 5, width = 12)
plt_fun(c('K', 'X', 'N')) +
  labs(
    title = 'The newbies',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

ggsave('increasing.pdf', height = 5, width = 12)